Step 1: Importing Required Libraries¶

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pylab as plt
import seaborn as sns
%matplotlib inline 
plt.style.use(['bmh']) # bmh Styling used for Visulization

Step2: Reading the CSV File¶

In [2]:
df = pd.read_csv("MOCK_DATA csv.csv")
print ('Data read into a pandas dataframe!')
Data read into a pandas dataframe!

Step3: Analyzing Dataset¶

In [65]:
df.head() #Calling first 5 rows
Out[65]:
id first_name last_name email gender ip_address Car Colour company
0 1 Miguelita Ticic mticic0@ted.com Female 45.216.118.239 Honda Pink Hagenes, Hayes and Roob
1 2 Cami Sibery csibery1@nationalgeographic.com Female 193.140.90.251 Mazda Crimson Huels LLC
2 3 Melany Iskower miskower2@latimes.com Female 146.18.2.124 Hyundai Teal Hessel-Lebsack
3 4 Eb Shurman eshurman3@europa.eu Male 37.3.25.144 Land Rover Purple Walker-Bogisich
4 5 Emery Springett espringett4@livejournal.com Male 15.31.162.5 Pontiac Aquamarine Nader, Lynch and Wiza
In [66]:
df.dtypes #Checking Datatypes
Out[66]:
id             int64
first_name    object
last_name     object
email         object
gender        object
ip_address    object
Car           object
Colour        object
company       object
dtype: object
In [67]:
df.info(verbose=False) #Information about the dataframe
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Columns: 9 entries, id to company
dtypes: int64(1), object(8)
memory usage: 70.4+ KB
In [68]:
df.isnull().sum()
Out[68]:
id            0
first_name    0
last_name     0
email         0
gender        0
ip_address    0
Car           0
Colour        0
company       0
dtype: int64
In [69]:
df.corr() #Checking Correlation between columns
Out[69]:
id
id 1.0

To view the dimensions of the dataframe, we use the .shape parameter.

In [70]:
df.shape # size of dataframe (rows, columns)  
Out[70]:
(1000, 9)

Step 4: Task 1(Creating a Visual to show Top 10 Vehicle Ownership According to Gender)¶

In [71]:
df.columns
Out[71]:
Index(['id', 'first_name', 'last_name', 'email', 'gender', 'ip_address', 'Car',
       'Colour', 'company'],
      dtype='object')
In [79]:
df1 = df[['gender','Car','Colour']] #creating new dataframe
df1
Out[79]:
gender Car Colour
0 Female Honda Pink
1 Female Mazda Crimson
2 Female Hyundai Teal
3 Male Land Rover Purple
4 Male Pontiac Aquamarine
... ... ... ...
995 Male Mazda Maroon
996 Male Buick Pink
997 Female Chevrolet Crimson
998 Female Volkswagen Violet
999 Male Jeep Blue

1000 rows × 3 columns

dummies = pd.get_dummies(df1.Car) merged = pd.concat([df1, dummies], axis='columns') merged = merged.drop(['Car'], axis='columns') merged

In [80]:
df1 = df1[(df1.gender == "Male") | (df1.gender == "Female")]
df1
Out[80]:
gender Car Colour
0 Female Honda Pink
1 Female Mazda Crimson
2 Female Hyundai Teal
3 Male Land Rover Purple
4 Male Pontiac Aquamarine
... ... ... ...
995 Male Mazda Maroon
996 Male Buick Pink
997 Female Chevrolet Crimson
998 Female Volkswagen Violet
999 Male Jeep Blue

902 rows × 3 columns

In [82]:
df2 = df1.groupby(['gender','Colour'],as_index=False).count() #Using groupby function to create required pivot table
df2 
Out[82]:
gender Colour Car
0 Female Aquamarine 37
1 Female Blue 20
2 Female Crimson 20
3 Female Fuscia 24
4 Female Goldenrod 24
5 Female Green 27
6 Female Indigo 29
7 Female Khaki 17
8 Female Maroon 23
9 Female Mauv 21
10 Female Orange 37
11 Female Pink 25
12 Female Puce 17
13 Female Purple 14
14 Female Red 24
15 Female Teal 22
16 Female Turquoise 26
17 Female Violet 20
18 Female Yellow 28
19 Male Aquamarine 25
20 Male Blue 25
21 Male Crimson 23
22 Male Fuscia 23
23 Male Goldenrod 29
24 Male Green 25
25 Male Indigo 21
26 Male Khaki 25
27 Male Maroon 26
28 Male Mauv 22
29 Male Orange 25
30 Male Pink 29
31 Male Puce 23
32 Male Purple 22
33 Male Red 21
34 Male Teal 20
35 Male Turquoise 21
36 Male Violet 15
37 Male Yellow 27
In [108]:
grouped_pivot = df2.pivot(index='Colour',columns='gender',values='Car') #Creating Pivot Table
grouped_pivot.head()
Out[108]:
gender Female Male
Colour
Aquamarine 37 25
Blue 20 25
Crimson 20 23
Fuscia 24 23
Goldenrod 24 29
In [114]:
grouped_pivot.plot(kind='bar',figsize=(15,10),color=['pink','b'])

plt.title('Color of Cars and Their Ownership by Gender',fontsize = 20)
plt.xlabel('Colour',fontsize = 15)
plt.ylabel('Gender',fontsize = 15)
plt.legend(fontsize = 10)
plt.show()
In [106]:
df3 = df1.groupby(['gender','Car'],as_index=False).count() #Using groupby function to create required pivot table
df3.head()
Out[106]:
gender Car Colour
0 Female Acura 16
1 Female Aston Martin 5
2 Female Audi 12
3 Female BMW 20
4 Female Bentley 2
In [105]:
grouped_pivot2 = df3.pivot(index='Car',columns='gender',values='Colour') #Creating Pivot Table
grouped_pivot2.head()
Out[105]:
gender Female Male
Car
Acura 16.0 7.0
Alfa Romeo NaN 2.0
Aston Martin 5.0 1.0
Audi 12.0 15.0
BMW 20.0 14.0
In [115]:
grouped_pivot2.plot(kind='bar',stacked = False,figsize=(25,10))

plt.title('Car Model and Their Ownership by Gender',fontsize = 20)
plt.xlabel('Car Model',fontsize = 15)
plt.ylabel('Gender Count',fontsize = 15)
plt.legend(fontsize = 10)
plt.show()

Step 6: Task 3(Seeing the Correlation of Male Column with Other Features)¶

In [116]:
df.columns
Out[116]:
Index(['id', 'first_name', 'last_name', 'email', 'gender', 'ip_address', 'Car',
       'Colour', 'company'],
      dtype='object')
In [132]:
df4 = df[['gender','Car','Colour','company']] 
In [137]:
df5 = df4.apply(lambda x: x.factorize()[0]).corr()
df5
Out[137]:
gender Car Colour company
gender 1.000000 -0.041169 0.010857 0.026105
Car -0.041169 1.000000 0.026560 0.050811
Colour 0.010857 0.026560 1.000000 -0.043856
company 0.026105 0.050811 -0.043856 1.000000
In [139]:
plt.figure(figsize=(4, 10))
heatmap = sns.heatmap(df5[['company']][0:].sort_values(by='company', ascending=False), vmin=-1, vmax=1, annot=True, cmap='RdBu')
heatmap.set_title('Features and their Correlation with company Column', fontdict={'fontsize':18}, pad=16)
#plt.savefig('Male.png', dpi=300, bbox_inches='tight')
Out[139]:
Text(0.5, 1.0, 'Features and their Correlation with company Column')